
import subprocess
from bs4 import BeautifulSoup

# Pipeline: .docx --pandoc--> temp.html --BeautifulSoup--> processed.html
#           --pandoc--> output.md
# All intermediate and output files are written to the current working
# directory.

# Step 1: convert the Word document to HTML.
# NOTE: the input path is hard-coded; edit it to point at the .docx file
# that should be converted.
input_doc = "/Users/emery/Downloads/语料素材/1-概览/test-doc.docx"

# check=True raises subprocess.CalledProcessError when pandoc exits with a
# non-zero status, so a failed conversion stops the script here instead of
# continuing with a missing or stale temp.html left over from an earlier run.
subprocess.run(
    ['pandoc', '-f', 'docx', '-t', 'html', input_doc, '-o', 'temp.html'],
    check=True,
)

# Step 2: parse the intermediate HTML so it can be post-processed.
with open('temp.html', 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f, 'html.parser')

# Add custom HTML processing logic here,
# e.g. adjusting heading levels or handling specific styles.

# Step 3: save the (possibly modified) HTML.
with open('processed.html', 'w', encoding='utf-8') as f:
    f.write(str(soup))

# Step 4: convert the processed HTML to Markdown. The output format is strict
# Markdown with the pipe_tables, table_captions and yaml_metadata_block
# extensions enabled.
# check=True again, so a failed conversion is reported rather than silently
# leaving output.md missing or outdated.
subprocess.run(
    [
        'pandoc',
        '-f', 'html',
        '-t', 'markdown_strict+pipe_tables+table_captions+yaml_metadata_block',
        'processed.html',
        '-o', 'output.md',
    ],
    check=True,
)


#
